Loading libraries

library(EnhancedVolcano, verbose = FALSE)
## Loading required package: ggplot2
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
library(GEOquery, verbose = FALSE)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(limma, verbose = FALSE)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(umap, verbose = FALSE)
library(dplyr, verbose = FALSE)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1. Differential expression analysis of reference airway current vs never smoker dataset (A1, GSE63127)

1.1 Loading dataset

# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
#   Differential expression analysis with limma

# load series and platform data from GEO (date: 2024/10/15)
gset <- getGEO("GSE63127", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE63127_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL570", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]

# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))

# group membership for all samples
gsms <- paste0("X00100011111X00000000000X00000X000000000000X000X00",
               "XX00X0XXXXXXXXX1111111111111111111111X111X11111111",
               "XXXX1XXXXXXXXXXXXXXXXXXXXXXXX001000000010100111111",
               "01110111110011111001110011101011111001110101100011",
               "111111111111111111111111111111")
sml <- strsplit(gsms, split="")[[1]]

# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
length(sml) # 182 samples
## [1] 182
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("Non-smoker","Smoker"))
levels(gs) <- groups
gset$group <- gs

# Checking how many samples were excluded
A1_samples <- strsplit("X00100011111X00000000000X00000X000000000000X000X00XX00X0XXXXXXXXX1111111111111111111111X111X11111111XXXX1XXXXXXXXXXXXXXXXXXXXXXXX00100000001010011111101110111110011111001110011101011111001110101100011111111111111111111111111111111", "")[[1]]
length(A1_samples[A1_samples=="X"])
## [1] 48

1.2 Quality control checks and normalization

## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization 

## Histogram ##
hist(as.matrix(exprs(gset)),
     main = "GSE63127 overall distribution of expression values",
     xlab = "RMA normalized probe intensity",
     ylab = "Frequency") # skewed left, needs log2 transform

# boxplot(exprs(gset),)
# It takes up lots of computation so commenting out for now

max(exprs(gset))
## [1] 136808
min(exprs(gset))
## [1] 0.0657913
# Should do log2 normalization

## log2 normalization ##
exprs(gset) <- log2(exprs(gset)+1)

# quantile normalization: no longer doing this to better capture variation
## exprs(gset) <- normalizeBetweenArrays(exprs(gset)) 

hist(as.matrix(exprs(gset)), 
     main = "GSE63127 distribution of expression values",
     xlab = "RMA and log2 normalized probe intensity",
     ylab = "Frequency") # much better

# expression value distribution
par(mar=c(4,4,2,1))
title <- paste ("GSE63127", "/", annotation(gset), " value distribution", sep ="")
plotDensities(exprs(gset), group=gs, main=title, legend ="topright")

### Generate a boxplot ###

ex <- exprs(gset)
ord <- order(gs)  # order samples by group

palette(
  c("#7570B3", "#1B9E77")
  )
par(mar=c(7,4,2,1))
title <- "GSE63127 sample-wise distribution of expression values"
boxplot(ex[,ord], boxwex=0.6, notch=T,  outline=FALSE, las=2, col=gs[ord],
        main=title, 
        #xlab = "Samples", 
        ylab = "RMA and log2 normalized probe intensity",
        xaxt="n",
        whisklty = 0,
        #border = gs[ord]
        lwd = 0.8,
        space = -1
        )

# Add axis labels for clarification
mtext("Samples", side=1, line=2)  # Add "Samples" as the x-axis label
# Add legend
legend("bottomleft", inset=c(0, -0.3), fill=palette(), legend=groups, 
       bty="n", xpd=TRUE)  # Adjust inset and allow drawing outside plot area

min(exprs(gset))
## [1] 0.09192496
max(exprs(gset))
## [1] 17.0618

1.3 Checking and correcting batch effect / sources of variation

Download and clean up the phenotypic information table from the dataset

phenotypic_data <- pData(gset)  # Extract phenotypic data
head(phenotypic_data, n = 3)
##                                   title geo_accession                status
## GSM190150 Small airways, non-smoker 029     GSM190150 Public on Dec 16 2008
## GSM190153 Small airways, non-smoker 036     GSM190153 Public on Jun 17 2008
## GSM254157     small airways, smoker 112     GSM254157 Public on Jun 17 2008
##           submission_date last_update_date type channel_count
## GSM190150     May 17 2007      Aug 28 2018  RNA             1
## GSM190153     May 17 2007      Aug 28 2018  RNA             1
## GSM254157     Jan 03 2008      Aug 28 2018  RNA             1
##                                                         source_name_ch1
## GSM190150 airway epithelial cells obtained by bronchoscopy and brushing
## GSM190153 airway epithelial cells obtained by bronchoscopy and brushing
## GSM254157 airway epithelial cells obtained by bronchoscopy and brushing
##           organism_ch1 characteristics_ch1 characteristics_ch1.1
## GSM190150 Homo sapiens             age: 34                sex: M
## GSM190153 Homo sapiens             age: 45                sex: F
## GSM254157 Homo sapiens             age: 45                sex: M
##            characteristics_ch1.2                 characteristics_ch1.3
## GSM190150    ethnic group: black            smoking status: non-smoker
## GSM190153 ethnic group: hispanic            smoking status: non-smoker
## GSM254157    ethnic group: white smoking status: smoker, 23 pack-years
##           molecule_ch1
## GSM190150    total RNA
## GSM190153    total RNA
## GSM254157    total RNA
##                                                                                                      extract_protocol_ch1
## GSM190150 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM190153 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM254157 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
##           label_ch1
## GSM190150    biotin
## GSM190153    biotin
## GSM254157    biotin
##                                                                                                                                                                  label_protocol_ch1
## GSM190150   Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM190153   Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM254157 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 1-2 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
##           taxid_ch1
## GSM190150      9606
## GSM190153      9606
## GSM254157      9606
##                                                                                                                                                                                  hyb_protocol
## GSM190150 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM190153 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM254157 Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
##                                                        scan_protocol
## GSM190150 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM190153 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM254157 GeneChips were scanned using the GeneChip Scanner 3000 7G.
##                         description
## GSM190150 small airways, non-smoker
## GSM190153 small airways, non-smoker
## GSM254157 small airways, smoker 112
##                                                                                                                                                     data_processing
## GSM190150 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM190153 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM254157 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
##           platform_id           contact_name           contact_email
## GSM190150      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM190153      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM254157      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
##           contact_laboratory             contact_department
## GSM190150            Crystal Department of Genetic Medicine
## GSM190153            Crystal Department of Genetic Medicine
## GSM254157            Crystal Department of Genetic Medicine
##                       contact_institute  contact_address contact_city
## GSM190150 Weill Cornell Medical College 1300 York Avenue     New York
## GSM190153 Weill Cornell Medical College 1300 York Avenue     New York
## GSM254157 Weill Cornell Medical College 1300 York Avenue     New York
##           contact_state contact_zip/postal_code contact_country
## GSM190150            NY                   10021             USA
## GSM190153            NY                   10021             USA
## GSM254157            NY                   10021             USA
##                                                                          supplementary_file
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CEL.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CEL.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CEL.gz
##                                                                        supplementary_file.1
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CHP.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CHP.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CHP.gz
##           data_row_count                 relation               relation.1
## GSM190150          54675 Reanalyzed by: GSE119087                         
## GSM190153          54675  Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
## GSM254157          54675  Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
##           age:ch1 cilia length:ch1 copd status:ch1
## GSM190150      34             <NA>            <NA>
## GSM190153      45             <NA>            <NA>
## GSM254157      45             <NA>            <NA>
##           department of genetic medicine id:ch1 ethnic group:ch1 ethnicity:ch1
## GSM190150                                  <NA>            black          <NA>
## GSM190153                                  <NA>         hispanic          <NA>
## GSM254157                                  <NA>            white          <NA>
##           serum 25-oh-d:ch1 sex:ch1    smoking status:ch1      group
## GSM190150              <NA>       M            non-smoker Non.smoker
## GSM190153              <NA>       F            non-smoker Non.smoker
## GSM254157              <NA>       M smoker, 23 pack-years     Smoker
# This is filtered down to the samples that were included.
# I will first try to clean up the phenotypic data.

# The features I want to keep are:
# Dates of submission/updates etc, sex, ethnicity, vitamin D, smoking status, cilia length
# Will not use most of the information but keeping it for the sake of interest)
# Keep the columns that might contain data of interest, which will need to be cleaned up.

# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("geo_accession","status","submission_date","last_update_date","characteristics_ch1","characteristics_ch1.1","characteristics_ch1.2","characteristics_ch1.3","age:ch1","cilia length:ch1","ethnic group:ch1","ethnicity:ch1","serum 25-oh-d:ch1","sex:ch1","smoking status:ch1","group")

# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)

phenotypic_data <- phenotypic_data[,c(indexes)]

# Now I need to parse out sex, ethnicity, smoking status, and age, vitamin D, pack years.

#Rename "group" as "smoking status"
names(phenotypic_data)[16] <- "smoking_status"

## Grabbing ethnicity values from the columns ##
# Initialize a new column "ethnicity" with NA values
phenotypic_data$ethnicity <- NA

# Function to find 'eth' in a row and return the corresponding value
find_ethnicity <- function(row) {
  eth_column <- which(grepl('eth', row))
  if (length(eth_column) > 0) {
    return(row[eth_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "ethnicity" column
phenotypic_data$ethnicity <- apply(phenotypic_data, 1, find_ethnicity)

## Grabbing sex values from the columns ##
# Initialize a new column "sex" with NA values
phenotypic_data$sex <- NA

# Function to find 'sex' in a row and return the corresponding value
find_sex <- function(row) {
  sex_column <- which(grepl('sex', row))
  if (length(sex_column) > 0) {
    return(row[sex_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "sex" column
phenotypic_data$sex <- apply(phenotypic_data, 1, find_sex)


## Grabbing pack_years values from the columns ##
# Initialize a new column "pack_years" with NA values
phenotypic_data$pack_years <- NA

# Function to find 'pack_years' in a row and return the corresponding value, but just the first instance
find_pack_years <- function(row) {
  pack_years_column <- which(grepl('pack', row))
  if (length(pack_years_column) > 0) {
    return(row[pack_years_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "pack_years" column
phenotypic_data$pack_years <- apply(phenotypic_data, 1, find_pack_years)
#unlist the column
phenotypic_data$pack_years <- unlist(phenotypic_data$pack_years )



## Grabbing age values from the columns ##
# Initialize a new column "age" with NA values
phenotypic_data$age <- NA

# Function to find 'age' in a row and return the corresponding value
find_age <- function(row) {
  age_column <- which(grepl('age', row))
  if (length(age_column) > 0) {
    return(row[age_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "age" column
phenotypic_data$age <- apply(phenotypic_data, 1, find_age)


## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data$vitamin_d <- NA

# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
  vitamin_d_column <- which(grepl('vitamin', row))
  if (length(vitamin_d_column) > 0) {
    return(row[vitamin_d_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data$vitamin_d <- apply(phenotypic_data, 1, find_vitamin_d)

## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data$vitamin_d <- NA

# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
  vitamin_d_column <- which(grepl('vitamin', row))
  if (length(vitamin_d_column) > 0) {
    return(row[vitamin_d_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data$vitamin_d <- apply(phenotypic_data, 1, find_vitamin_d)

## Grabbing cilia values from the columns ##
# Initialize a new column "cilia_length" with NA values
phenotypic_data$cilia_length <- NA

# Function to find 'cilia' in a row and return the corresponding value, first instance
find_cilia <- function(row) {
  cilia_column <- which(grepl('cilia', row))
  if (length(cilia_column) > 0) {
    return(row[cilia_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "cilia" column
phenotypic_data$cilia_length <- apply(phenotypic_data, 1, find_cilia)

## Now cut out the messy columns
phenotypic_data <- phenotypic_data[,-c(5:15)]

## Remove unnecessary prefix info
phenotypic_data$ethnicity <- gsub(".*: ", "", phenotypic_data$ethnicity )
phenotypic_data$age <- gsub(".*: ", "", phenotypic_data$age)
phenotypic_data$sex <- gsub(".*: ", "", phenotypic_data$sex)
phenotypic_data$vitamin_d <- gsub(".*: ", "", phenotypic_data$vitamin_d)
phenotypic_data$cilia_length <- gsub(".*: ", "", phenotypic_data$cilia_length)

phenotypic_data$pack_years<- gsub(".*, ", "", phenotypic_data$pack_years)
phenotypic_data$pack_years<- gsub("pack-years", "", phenotypic_data$pack_years)


# Reformat the submission dates to be sortable

phenotypic_data <- phenotypic_data %>%
  mutate(submission_date = ifelse(submission_date == "Dec 20 2012", "2012-12-20", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jan 03 2008", "2008-01-08", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jan 31 2013", "2013-01-31", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jun 03 2010", "2010-06-03", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jun 13 2008", "2008-06-13", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "May 17 2007", "2007-05-17", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Nov 08 2013", "2013-11-08", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Nov 10 2014", "2014-11-10", submission_date))

head(phenotypic_data, n = 3)
##           geo_accession                status submission_date last_update_date
## GSM190150     GSM190150 Public on Dec 16 2008      2007-05-17      Aug 28 2018
## GSM190153     GSM190153 Public on Jun 17 2008      2007-05-17      Aug 28 2018
## GSM254157     GSM254157 Public on Jun 17 2008      2008-01-08      Aug 28 2018
##           smoking_status ethnicity sex pack_years age vitamin_d cilia_length
## GSM190150     Non.smoker     black   M       <NA>  34      <NA>         <NA>
## GSM190153     Non.smoker  hispanic   F       <NA>  45      <NA>         <NA>
## GSM254157         Smoker     white   M        23   45      <NA>         <NA>

Plot PCA and use phenotypic information to look for sources of batch effect/variation, and correct for these with ComBat

# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("Non-smoker","Smoker"))
levels(gs) <- groups
gset$group <- gs


## Plot PCA 1 ##
colz <- as.numeric(as.factor(gs)) # Get color values from group

plotMDS(exprs(gset),
        gene.selection = "common",
       # main = "PCA for CS vs NS GSE63127",
        col = colz,
        pch = 1
)
legend("bottom", 
       legend = c("Smoker", "Non-smoker"), 
       col = c("#7570B3", "#1B9E77"), # Colors: only for smoking status
       pch = c(15, 15),                   # Shapes: 2 = triangle, 1 = circle
       pt.cex = c(1, 1),             # Adjust size for better visibility
       text.col = "black",                     # Text color
#       bty = "n"
       )                              # Box type: 'n' removes border

## We have 4 definite clusters that are not based on smoking status. 
## As such, it is a good idea to check the table of sample phenotypic information to look for sources of variation between samples.


pointz <- as.numeric(as.factor(phenotypic_data$submission_date<= "2010-06-03")) # Get point shape values from date of submission: split into 2010 and earlier, post-2010]

## Plot PCA with date information##
plotMDS(exprs(gset),
        gene.selection = "common",
       # main = "PCA for CS vs NS GSE63127",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz
        #labels = gset$group
)
legend("bottom", 
       legend = c("Smoker", "Non-smoker", 
                  "2010 and Prior", "Post-2010"), 
       col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
       pch = c(15, 15, 2, 1),                   # Shapes: 2 = triangle, 1 = circle
       pt.cex = c(1, 1, 1, 1),             # Adjust size for better visibility
       text.col = "black",                     # Text color
#       bty = "n"
       )                              # Box type: 'n' removes border

# Clearly the source of batch effect in PC1 is submission date post-2010.
# Note: I found out that the split was at 2010 by doing a bit of experimenting with other clustering methods, not shown here. This PCA simply shows proof that the split occurs at 2010.

# First batch correction (submission date)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.3.3
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
library(limma)

# Making a batch vector
submission_post_2010_batch <- ifelse(phenotypic_data$submission_date < as.Date("2012-01-01"), 1, 2)

# Adjust the expression matrix for submission date batch effect
exprs_matrix_combat <- ComBat(dat=exprs(gset), batch=submission_post_2010_batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Plot PCA for expression values after first batch correction ##
date_corrected_PCA <- plotMDS(exprs_matrix_combat,
        gene.selection = "common",
       # main = "PCA for CS vs NS GSE63127, corrected for submission date",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz

)
legend("bottom",
legend = c("Smoker", "Non-smoker", 
                  "2010 and Prior", "Post-2010"), 
       col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
       pch = c(15, 15, 2, 1)
       #title = "Smoking status"
       )

## Some evidence that second source of variation could be due to sex (but only 11/182 samples have sex labels):
plotMDS(exprs_matrix_combat,
        gene.selection = "common",
       # main = "PCA for CS vs NS GSE63127, corrected for submission date",
        col = colz, # Colors smokers red and nonsmokers black
        #pch = pointz2 # Using separate shapes for all submission dates
        labels = phenotypic_data$sex
)
legend("bottom",
       legend = c("Smoker", "Non-smoker", "M = Male", "F = Female"),
       col = c("#7570B3", "#1B9E77", "black", "black"),
       pch = c(15, 15, NA, NA)
       #title = "Smoking status"
       )

## Samples are divided by sex, but 11/182 samples is not enough to draw a definitive conclusion here.

## Second correction for unknown source of variation using ComBat: ##

# Assign batch labels based on the first dimension from MDS (equivalent to PC1), since the dividing line for the batches lies at 0
unknown_batch_labels <- ifelse(date_corrected_PCA$x < 0, 1, 2)

# Do a second batch correction
exprs_matrix_combat_2 <- ComBat(dat=exprs_matrix_combat, batch=unknown_batch_labels, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# View PCA plot
plotMDS(exprs_matrix_combat_2,
        gene.selection = "common",
        main = "PCA for CS vs NS GSE63127 after second correction",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz
        #labels = gset$group
)
legend("topleft", 
       legend = c("Smoker", "Non-smoker", 
                  "2010 and Prior", "Post-2010"), 
       col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
       pch = c(15, 15, 2, 1),                   # Shapes: 2 = triangle, 1 = circle
       pt.cex = c(1, 1, 1, 1),             # Adjust size for better visibility
       text.col = "black",                     # Text color
#       bty = "n"
       )  

## Now PC1 corresponds quite well to smoking status after the two ComBat corrections.

1.4 Differential expression analysis (limma with vooma)

# Finish setting up the design matrix
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)


## Crucial bit: Replace the expression values in gset with the batch corrected ones ##
exprs(gset) <- as.matrix(exprs_matrix_combat_2)

# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

v$genes <- fData(gset) # attach gene annotations


## Making the processed expression data to save
# vooma_expr <- as.data.frame(v[["E"]])
# vooma_expr <- cbind(Gene = fData(gset)$Gene.symbol, vooma_expr)
# write.table(vooma_expr, "../2_Outputs/2_Airway_expression/A1_exprs_20250403.txt")

# fit linear model
fit  <- lmFit(v)

# set up contrasts of interest and recalculate model coefficients
cts <- paste(groups[2], groups[1], sep="-")
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)

# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)

tT <- subset(tT, select=c("ID","Gene.symbol","logFC","adj.P.Val"))

1.5 Basic filtering of DEGs (unlabelled, duplicates)

GSE63127_CS_NS_limma <- tT %>%
  filter(Gene.symbol != "") %>% # Remove blank gene symbols
  group_by(Gene.symbol) %>%
  slice_min(adj.P.Val, with_ties = TRUE) %>% 
  # For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
  ungroup()
head(GSE63127_CS_NS_limma)
## # A tibble: 6 × 4
##   ID          Gene.symbol  logFC adj.P.Val
##   <chr>       <chr>        <dbl>     <dbl>
## 1 229819_at   A1BG        -0.106    0.481 
## 2 232462_s_at A1BG-AS1     0.531    0.0224
## 3 220951_s_at A1CF         0.302    0.123 
## 4 1558450_at  A2M          0.110    0.453 
## 5 1564139_at  A2M-AS1     -0.138    0.0724
## 6 1553505_at  A2ML1        0.145    0.636
# Checking for ties
ties <- GSE63127_CS_NS_limma %>%
  group_by(Gene.symbol) %>%
  filter(n() > 1) %>%
  ungroup()
print(ties) # No ties
## # A tibble: 0 × 4
## # ℹ 4 variables: ID <chr>, Gene.symbol <chr>, logFC <dbl>, adj.P.Val <dbl>
nrow(GSE63127_CS_NS_limma)
## [1] 22189
# For now we are not applying any FDR or log2FC thresholds, that occurs in script 2.1

1.8 Visualization of DEGs (volcano plot)

FDR_cutoff_A1 <- 0.05
log2FC_cutoff_A1 <- 0.25

v_A1 <- EnhancedVolcano::EnhancedVolcano(
  toptable = GSE63127_CS_NS_limma,
  lab = GSE63127_CS_NS_limma$Gene.symbol,
  x = "logFC", # "mean difference" is estimate here
  y = "adj.P.Val", 
 # pCutoffCol = 'min_smoothed_fdr',
  xlab = "log2FC",
  ylab = "-log10(FDR)",
  title = "A1 DEGs",
  subtitle = paste0("log2FC cutoff: ", log2FC_cutoff_A1, "    FDR cutoff: ", FDR_cutoff_A1),
  caption = paste0("Total = ", nrow(GSE63127_CS_NS_limma[abs(GSE63127_CS_NS_limma$logFC)>log2FC_cutoff_A1 &  GSE63127_CS_NS_limma$adj.P.Val < FDR_cutoff_A1,]), " significant DEGs meeting cutoffs"),
  col = c("grey30", "mediumpurple2", "royalblue", "orange2"),
  legendPosition = "bottom",
  labSize = 3,
  max.overlaps = 3,
  drawConnectors = TRUE,
  widthConnectors = 0.3,
  arrowheads = FALSE,
  pCutoff = FDR_cutoff_A1,
  FCcutoff = log2FC_cutoff_A1,
  gridlines.minor = FALSE,
  gridlines.major = FALSE,
  xlim = c(-3, 6)
)

v_A1
## Warning: ggrepel: 3577 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

1.7 Save outputs

# Change date suffix as appropriate if changes are made
#write.table(GSE63127_CS_NS_limma, "../2_Outputs/1_Airway_DEGs/GSE63127_CS_NS_limma_20250307.txt", sep = '\t')

2. Differential expression analysis of reference reference “persistent” airway current vs former vs never smoker dataset (A2)

2.1 Loading dataset

# load series and platform data from GEO

gset <- getGEO("GSE7895", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE7895_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL96", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]

# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))

# group membership for all samples
gsms <- paste0("22222222222222222222200000000000000000000000000000",
               "00000000000000000000000111111111111111111111111111",
               "1111")
sml <- strsplit(gsms, split="")[[1]]

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values

# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("current_smoker","former_smoker","never_smoker"))
levels(gs) <- groups
gset$group <- gs
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values

2.2 Quality control checks and normalization

Initial checks (histogram, boxplot, PCA), and quantile normalization

## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##

hist(as.matrix(exprs(gset))) # Values range 1-15, and 1 big peak around 3. 

boxplot(exprs(gset)) # Same range of values, with similar-looking ranges, but not exactly the same

# Narrow range, therefore no log2 normalization needed

exprs(gset) <- normalizeBetweenArrays(exprs(gset))
boxplot(exprs(gset))

# 2024/11/12: I elected to use quantile normalization

min(exprs(gset))
## [1] -0.2140344
max(exprs(gset))
## [1] 14.59784
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
        gene.selection = "common",
        main = "PCA for GSE7895",
        col = colz,
        pch = 1
        #labels = gs
        )

legend("topright", legend = levels(as.factor(gs)), 
       fill = unique(colz), 
       title = "Smoking status")

# No separation

Investigating sources of variation that could account for lack of separation of samples by smoking status

Extract and format phenotypic data

library(stringr)

phenotypic_data <- pData(gset)  # Extract phenotypic data

# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("characteristics_ch1.1", "group")

# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)

phenotypic_data <- phenotypic_data[,c(indexes)]

# Extract Age
phenotypic_data$age <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Age:)\\d+"))

# Extract Packyears
phenotypic_data$packyears <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Packyears:)\\d+"))

# Extract Time Since Quit Smoking (months)
phenotypic_data$TSQ_months <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Time Since Quit Smoking \\(months\\):)\\d+"))

# Delete the original column with the unseparated info
phenotypic_data <- phenotypic_data[,-1]

# Convert the NA values for packyears for never smokers to zero (this makes sense since the never smokers have 0 pack years)
phenotypic_data$packyears[phenotypic_data$group=="never_smoker"] <- 0

# Convert the NA values for TSQ_months to zero for current smokers (again makes sense)
phenotypic_data$TSQ_months[phenotypic_data$group=="current_smoker"] <- 0

# Make column to denote just former smoking status for the linear model
phenotypic_data$former_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "former_smoker"))

# Make column to denote just current smoking status for the linear model
phenotypic_data$current_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "current_smoker"))

# Make column to denote just never smoking status for the linear model
phenotypic_data$never_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "never_smoker"))

Plot PCA using other phenotypic data

## Plot PCA using age to define color
# Create a gradient color palette (light blue to dark blue)
palette <- colorRampPalette(c("lightblue", "darkblue"))

## Plot PCA of age ##
colz_age <- palette(length(phenotypic_data$age))[rank(phenotypic_data$age)]  # Map ages to gradient colors
plotMDS(exprs(gset),
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher age)",
        col = colz_age,
        pch = 1
        )
# Add a color bar for age
legend("topright", legend = range(phenotypic_data$age), 
       fill = palette(2), 
       title = "Age")

# Does not seem to be an age effect

### Plot PCA of packyears ###

# Excluding packyears of zero (never smokers)
pheno_packyears <- phenotypic_data[phenotypic_data$packyears!=0,]
exprs_packyears <- as.data.frame(exprs(gset)) %>%
  dplyr::select(rownames(pheno_packyears))

colz_packyears <-  palette(length(pheno_packyears$packyears))[rank(pheno_packyears$packyears)] # Map packyears to gradient colors
plotMDS(exprs_packyears,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher packyears)",
        col = colz_packyears,
        pch = 1
        #labels = gs
        )
# Add a color bar for packyears
legend("topright", legend = range(pheno_packyears$packyears), 
       fill = palette(2), 
       title = "Packyears")

## Does not seem to be packyears effect


### Plot PCA of time since quitting ###
pheno_tsq <- phenotypic_data[!is.na(phenotypic_data$TSQ_months),]
exprs_tsq <- as.data.frame(exprs(gset)) %>%
  dplyr::select(rownames(pheno_tsq))

colz_TSQ <- palette(length(pheno_tsq$TSQ))[rank(pheno_tsq$TSQ)]  # Map packyears to gradient colors
plotMDS(exprs_tsq,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ more time since quitting)",
        col = colz_TSQ,
        pch = 1
        #labels = gs
        )
legend("topright", legend = range(pheno_tsq$TSQ), 
       fill = palette(2), 
       title = "TSQ")

## Does not seem to be TSQ effect

This did not determine the underlying source of variation explaining sample distribution.

Differential expression analysis

v <- vooma(gset, design, plot=T)

v$genes <- fData(gset) # attach gene annotations


# fit linear model
fit  <- lmFit(v)

# set up contrasts of interest and recalculate model coefficients
#cts <- c(paste(groups[1],"-",groups[2],sep=""), paste(groups[1],"-",groups[3],sep=""), paste(groups[2],"-",groups[3],sep=""))
#cont.matrix <- makeContrasts(contrasts=cts, levels=design)
cont.matrix <- makeContrasts(
  CS_vs_NS = current_smoker - never_smoker,
  FS_vs_NS = former_smoker - never_smoker,
  CS_vs_FS = current_smoker - former_smoker,
  levels = design
)

fit2 <- contrasts.fit(fit, cont.matrix)


# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, proportion = 0.01) # Proportion is "assumed proportion of genes which are differentially expressed"

Select “persistent” DEGs, and basic filter (keep lower FDR of duplicates, apply FDR < 0.05 cutoff)

library(dplyr)
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
## 
## Attaching package: 'futile.logger'
## The following object is masked from 'package:mgcv':
## 
##     scat
## Separate out genes that are DEGS in CS vs NS and FS vs NS

## Note: I have decided not to filter out genes that are significantly different between CS and FS because I realized that doesn't make logical sense.

# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr", 
                  p.value=0.05, 
                  lfc=0)
# Note that this already has a p-value cutoff of 0.05 unlike my other datasets, but this is ok because this is the least stringent possible cutoff we will use anyway

# Venn diagram of results
vennDiagram(dT)

# Select the genes differentially expressed in both CS_vs_NS and FS_vs_NS
dT_persistent <- dT %>%
  as.data.frame(.) %>%
  filter(CS_vs_NS != 0) %>% # Differentially expressed in CS vs NS
  filter(CS_vs_NS == FS_vs_NS)# Differentially expressed, same sign in FS vs NS
nrow(dT_persistent) # 128 genes indeed
## [1] 128
# Get the toptable format for all genes
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) # Inf shows all the significant genes

# Filter to the "persistent" genes
tT_persistent <- tT %>%
  filter(ID %in% rownames(dT_persistent))

# Filter out blanks, keep lower FDR of ties
tT_persistent <- tT_persistent %>%
  filter(Gene.symbol != "") %>% # Remove blank gene symbols
  group_by(Gene.symbol) %>%
  slice_min(adj.P.Val, with_ties = TRUE) %>% 
  # For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
  ungroup()
nrow(tT_persistent)
## [1] 116
## TO DO: Change this to filtering out duplicates

# Checking for ties
ties <- tT_persistent%>%
  group_by(Gene.symbol) %>%
  filter(n() > 1) %>%
  ungroup()
print(ties)
## # A tibble: 2 × 28
##   ID      Gene.title Gene.symbol Gene.ID UniGene.title UniGene.symbol UniGene.ID
##   <chr>   <chr>      <chr>       <chr>   <chr>         <chr>          <chr>     
## 1 214303… mucin 5AC… MUC5AC      4586    ""            ""             ""        
## 2 214385… mucin 5AC… MUC5AC      4586    ""            ""             ""        
## # ℹ 21 more variables: Nucleotide.Title <chr>, GI <int>,
## #   GenBank.Accession <chr>, Platform_CLONEID <lgl>, Platform_ORF <lgl>,
## #   Platform_SPOTID <chr>, Chromosome.location <chr>,
## #   Chromosome.annotation <chr>, GO.Function <chr>, GO.Process <chr>,
## #   GO.Component <chr>, GO.Function.ID <chr>, GO.Process.ID <chr>,
## #   GO.Component.ID <chr>, CS_vs_NS <dbl>, FS_vs_NS <dbl>, CS_vs_FS <dbl>,
## #   AveExpr <dbl>, F <dbl>, P.Value <dbl>, adj.P.Val <dbl>
# As there is a tie with MUCA5 I will remove the MUCA5 probe with an "x" label for cross-reactivity
#tT_persistent <- tT_persistent %>% filter (ID != "214303_x_at")

#Pick the columns we care about
GSE7895_persistent_DEGs <- tT_persistent %>%
  dplyr::select(., Gene.symbol, CS_vs_NS, FS_vs_NS, CS_vs_FS, adj.P.Val) %>%
  dplyr::rename(., Gene = Gene.symbol, CS_NS_A2 = CS_vs_NS, FS_NS_A2 = FS_vs_NS, CS_FS_A2 = CS_vs_FS, FDR_A2 = adj.P.Val)

# Save output
#write.table(GSE7895_persistent_DEGs, "../2_Outputs/1_Airway_DEGs/GSE7895_persistent_DEGs_quantile_20250307.txt")

Extra check (PCA for stratification of samples based on persistent genes)

## Filter exprs to the "persistent" genes
exprs_persistent <- as.data.frame(exprs(gset)) %>%
  filter(rownames(.) %in% tT_persistent$ID)

## Plot PCA ##
colz<- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs_persistent,
        gene.selection = "common",
        main = "PCA for GSE7895 with persistent genes",
        col = colz,
        pch = 1
        #labels = gs
        )

legend("topright", legend = levels(as.factor(gs)), 
       fill = unique(colz), 
       title = "Smoking status")

# You can see more separation happening, but I would expect to see current and former smokers more mixed together, whereas we see former and never smokers more mixed together. Hmm okay, interesting at least.

# Might be good to check on the age, packyears and TSQ here as well?


## Plot PCA of age ##
colz_age <- palette(length(phenotypic_data$age))[rank(phenotypic_data$age)]  # Map ages to gradient colors
plotMDS(exprs_persistent,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher age)",
        col = colz_age,
        pch = 16
        )
# Add a color bar for age
legend("topright", legend = range(phenotypic_data$age), 
       fill = palette(2), 
       title = "Age")

# Does not seem to be an age effect

### Plot PCA of packyears ###

# Excluding packyears of zero (never smokers)
exprs_persistent_packyears <- as.data.frame(exprs_persistent) %>%
  dplyr::select(rownames(pheno_packyears))

colz_packyears <-  palette(length(pheno_packyears$packyears))[rank(pheno_packyears$packyears)] # Map packyears to gradient colors
plotMDS(exprs_persistent_packyears,
        gene.selection = "common",
        main = "PCA for GSE7895 persistent genes (darker blue ~ higher packyears)",
        col = colz_packyears,
        pch = 16
        #labels = gs
        )
# Add a color bar for packyears
legend("bottomleft", legend = range(pheno_packyears$packyears), 
       fill = palette(2), 
       title = "Packyears")

## Maybe some sort of packyears effect happening, not obviously so


### Plot PCA of time since quitting ###
exprs_persistent_tsq <- as.data.frame(exprs_persistent) %>%
  dplyr::select(rownames(pheno_tsq))

colz_TSQ <- palette(length(pheno_tsq$TSQ))[rank(pheno_tsq$TSQ)]  # Map packyears to gradient colors
plotMDS(exprs_persistent_tsq ,
        gene.selection = "common",
        main = "PCA for GSE7895 persistent genes (darker blue ~ more months since quitting)",
        col = colz_TSQ,
        pch = 16
        #labels = gs
        )
legend("bottomleft", legend = range(pheno_tsq$TSQ), 
       fill = palette(2), 
       title = "TSQ")

## Maybe some TSQ effect but not super obvious